## Sourobh Ghosh, Casey Kearney, Mahdi Hashemian
## Gov 2001 Replication Project


require(readstata13)

setwd("~/Dropbox/Spring 2016 Courses/Gov 2001/Gov 2001 Replication Paper/Data")  ## modify this as necessary

data = read.dta13("SovietCollaboration_MainDataset.dta")
data.Table5 = read.dta13("SovietCollaboration_CitationToSovietArtDataset.dta")  ## for Table 5
data.spec = read.dta13("SovietCollaboration_SpecializationDataset.dta") ## for Fig 7, Table 6, Table A8-A9

## dropping Soviet authors
newd = data[-which(data$sovietauthor == 1), ]

##### robust cluster SEs function (http://www.r-bloggers.com/easy-clustered-standard-errors-in-r/)

get_CL_vcov<-function(model, cluster){
  require(sandwich)
  require(lmtest)
  
  #calculate degree of freedom adjustment
  M <- length(unique(cluster))
  N <- length(cluster)
  K <- model$rank
  dfc <- (M/(M-1))*((N-1)/(N-K))
  
  #calculate the uj's
  uj  <- apply(estfun(model),2, function(x) tapply(x, cluster, sum))
  
  #use sandwich to get the var-covar matrix
  vcovCL <- dfc*sandwich(model, meat=crossprod(uj)/N)
  return(vcovCL)
}


###### Top and bottom three subfields

## subsetting top 3 bottom 3 dataset (133498 obs)
newt3b3 = subset(newd, newbdrankingposition<=3 | newbdrankingposition>=31, 
                 select=c(logauthourcount, newbdrankingposition, afterdummy, adjustedmsc, year))

## creating dummy for topsoviet
topsovietdummy = rep(0,nrow(newt3b3))
for (i in 1:nrow(newt3b3)){  if (newt3b3$newbdrankingposition[i] < 4){topsovietdummy[i] = 1} }

t3b3 = cbind(newt3b3,topsovietdummy)
fit = lm(logauthourcount ~ topsovietdummy:afterdummy
         + factor(adjustedmsc) + factor(year), data = t3b3)  ## OLS with fixed effects 

## call robust cluster SE function and save the var-cov matrix output in an object
m1.vcovCL<-get_CL_vcov(fit, t3b3$adjustedmsc)

## results
coeftest(fit, m1.vcovCL)  ## coefficient on AfterIronCurtain x SovietRich and corresponding robust cluster SE
summary(fit)$adj.r.squared  ## R-squared


###### Top three subfields

topsovietdummy = rep(0,nrow(newd))
for (i in 1:nrow(newd)){  if (newd$newbdrankingposition[i] < 4){topsovietdummy[i] = 1} }

t3 = cbind(newd,topsovietdummy)
fit2 = lm(logauthourcount ~ topsovietdummy:afterdummy
          + factor(adjustedmsc) + factor(year), data = t3)

m2.vcovCL<-get_CL_vcov(fit2, t3$adjustedmsc)

coeftest(fit2, m2.vcovCL)
summary(fit2)$adj.r.squared


###### Top five subfields

topsovietdummy = rep(0,nrow(newd))
for (i in 1:nrow(newd)){  if (newd$newbdrankingposition[i] < 6){topsovietdummy[i] = 1} }

t5 = cbind(newd,topsovietdummy)
fit3 = lm(logauthourcount ~ topsovietdummy:afterdummy
          + factor(adjustedmsc) + factor(year), data = t5)

m3.vcovCL<-get_CL_vcov(fit3, t5$adjustedmsc)

coeftest(fit3, m3.vcovCL)
summary(fit3)$adj.r.squared


###### Top ten subfields

topsovietdummy = rep(0,nrow(newd))
for (i in 1:nrow(newd)){  if (newd$newbdrankingposition[i] < 11){topsovietdummy[i] = 1} }

t10 = cbind(newd,topsovietdummy)
fit4 = lm(logauthourcount ~ topsovietdummy:afterdummy
          + factor(adjustedmsc) + factor(year), data = t10)

m4.vcovCL<-get_CL_vcov(fit4, t10$adjustedmsc)

coeftest(fit4, m4.vcovCL)
summary(fit4)$adj.r.squared


###### continuous

fit5 = lm(logauthourcount ~ newbdaftersoviet
          + factor(adjustedmsc) + factor(year), data = newd)  ## newbdaftersoviet variable in original data set has interaction pre-coded

m5.vcovCL<-get_CL_vcov(fit5, newd$adjustedmsc)

coeftest(fit5, m5.vcovCL)  ## note the coefficient is negative relative to what is reported in paper since increasing the newbdaftersoviet score implies a decrease in ranking (opposite direction of reported coefficient)
summary(fit5)$adj.r.squared
